#!/bin/bash
#SBATCH --job-name=run_eval_master_tr_343_vsmollm2_05b_captioning_1024
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1           # number of cores per task
#SBATCH --time 00:30:00              # maximum execution time (HH:MM:SS)
#SBATCH --output=/fsx/m4/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out
#SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT
#SBATCH --mail-user=hf-m4-hfc@huggingface.co
#SBATCH --partition=hopper-cpu
#SBATCH --no-requeue
#SBATCH --qos high

# ---------------------------------------------------------------------------------------------------------

# Trace every command (-x) and abort on the first failure (-e).
set -x -e

# Load the shared m4 user environment (paths, conda hook, ...).
source /fsx/m4/start-m4-user

# We are on an offline partition
# export HF_DATASETS_OFFLINE=1
# export TRANSFORMERS_OFFLINE=1

# ----------------- Auto-Workdir -----------------
# Resolve the real path of this script: under sbatch, $0 points at a Slurm
# spool copy, so the original path must be recovered via scontrol.
# Fix: the original test `[ -n $SLURM_JOB_ID ]` was always true when the
# variable was unset (unquoted, `[ -n ]` tests the literal "-n"), so plain
# `bash script.sh` runs wrongly took the scontrol branch.
if [ -n "${SLURM_JOB_ID:-}" ];  then
    # check the original location through scontrol and $SLURM_JOB_ID
    SCRIPT_PATH=$(scontrol show job "$SLURM_JOB_ID" | awk -F= '/Command=/{print $2}')
else
    # otherwise: started with bash. Get the real location.
    SCRIPT_PATH=$(realpath "$0")
fi
SCRIPT_DIR=$(dirname "${SCRIPT_PATH}")
# The repository root sits four levels above this script.
# `&& pwd` (instead of `; pwd`) avoids silently returning the current
# directory when the cd fails.
WORKING_DIR=$(builtin cd "$SCRIPT_DIR/../../../../" && pwd)
echo "Working dir is: $WORKING_DIR"

cd "$WORKING_DIR"
pushd "$WORKING_DIR"

# ---------------------------------------------------------------------------------------------------------

# TRAINING

SWEEP=""  # Leave to the empty string "" when no sweep
RUN_NAME="tr_343_vsmollm2_05b"
# Eval run name = this script's filename without the ".slurm" extension
# (SCRIPT_PATH is computed in the Auto-Workdir section above).
EVAL_RUNNAME=$(basename "${SCRIPT_PATH}" .slurm)

# TASK VARIABLES

NUM_SHOTS="4"                 # number of in-context examples per query
SHOT_SELECTION_MODE="random"  # how the in-context shots are picked
DATASET_SPLIT="validation"
USE_SELECTED_PROMPT_TEMPLATE_IDS="true"
# Python-style capitalization — this exact string is grepped verbatim against
# the results jsonl further below, so do not change its casing.
SCALE_UP_IMAGES="True"
IMAGE_SIZE_AFTER_SCALING="1024"

# The dev tasks
TASKS_TO_CHECK=(
    "CocoNewSplitsVLlama3ImageCaptioningInContextBleuCiderMeteorRouge"
)

# ---------------------------------------------------------------------------------------------------------

# MOST IMPORTANT VARIABLES TO TWEAK

SHOW_GPU_MEM_UTIL="False"  # Useful to debug and find the best configuration

TYPE_SHARDING="ZeRO-3"  # "Zero-3", "DDP"
TYPE_GPU="a100"  # "a100", "v100"
NUM_NODES="1" # Doesn't currently work with number > 1
NUM_GPUS_PER_NODE="8"  # max is 8 for a100, 4 for v100. min((NUM_CPUS_PER_TASK * max_num_gpus_per_node) / max_num_cpus_per_node, NUM_GPUS_PER_NODE) GPUs are automatically allocated
MODEL_PRECISION="bf16"  # "fp32", "bf16", "fp16"
BATCH_SIZE="16"
# NOTE(review): pinned to 1 instead of the commented formula, so the whole
# BATCH_SIZE lands on a single process — confirm this is intended.
NUM_PROCESSES=1 #$((NUM_GPUS_PER_NODE * NUM_NODES))
# Integer division: BATCH_SIZE should be a multiple of NUM_PROCESSES.
BATCH_SIZE_PER_GPU=$((BATCH_SIZE / NUM_PROCESSES))
echo "BATCH_SIZE_PER_GPU: $BATCH_SIZE_PER_GPU"

NUM_HOURS="6"  # wall-clock limit (hours) passed via --time to the eval jobs

# ---------------------------------------------------------------------------------------------------------

# MODEL GENERATION PARAMETERS

NUM_BEAMS="1"
# NO_REPEAT_NGRAM_SIZE="0"
MAX_NEW_TOKENS="40"

# ---------------------------------------------------------------------------------------------------------

# VARIABLES THAT SHOULDN'T NEED TO BE TOUCHED

TOKENIZER_USE_FAST="True"  # False for OPT-1.3B (True when https://github.com/huggingface/transformers/pull/20823 is merged), True for OPT-13B or other LM backbones

EVALUATION_VERSION="v2"

# EVALUATION_LOCAL_DATASETS="/gpfsscratch/rech/cnw/commun/local_datasets/"

# Where the training run stores its checkpoints (one opt_step-N dir each).
CHECKPOINTS_DIR="/scratch/m4/${RUN_NAME}"

# One JSON object per line, one line per finished evaluation; greped below to
# skip (checkpoint, task) pairs that were already evaluated.
EVALUATION_JSONL_FILE="/fsx/m4/experiments/local_experiment_dir/evals/results/${RUN_NAME}_evaluations.jsonl"

# Auto-generated accelerate/deepspeed configs are written here, keyed by the
# Slurm job id of this master job.
SAVE_DIR_CONFIGS="/fsx/m4/experiments/local_experiment_dir/evals/evaluation_configs/${RUN_NAME}"
mkdir -p $SAVE_DIR_CONFIGS
ACCELERATE_CONFIG_FILE="${SAVE_DIR_CONFIGS}/${SLURM_JOB_ID}_accelerate_config.yaml.autogenerated"
DEEPSPEED_CONFIG_FILE="${SAVE_DIR_CONFIGS}/${SLURM_JOB_ID}_ds_config.json.autogenerated"

CONDA_ENV_NAME="shared-m4-2024-05-28-copy3"
conda activate $CONDA_ENV_NAME

# ---------------------------------------------------------------------------------------------------------

# ESTABLISHING THE REMAINING EVALUATIONS TO BE DONE

# Candidate checkpoints to evaluate: opt_step-22500 .. opt_step-25000,
# stepping by 1000 (i.e. 22500, 23500, 24500).
DIR_MODEL_CHECKPOINTS=()
# CHANGE ME
for (( opt_step = 22500; opt_step <= 25000; opt_step += 1000 )); do
  DIR_MODEL_CHECKPOINTS+=("${CHECKPOINTS_DIR}/opt_step-${opt_step}/unwrapped_model")
done


# ---------------------------------------------------------------------------------------------------------

# AUTOMATICALLY FILLED VARIABLES

# Map the GPU type to the number of CPUs reserved per GPU.
# Fixes: quote the variable inside the test (the unquoted `[ $TYPE_GPU == … ]`
# is a syntax error when the variable is empty) and exit non-zero on the
# error path (a bare `exit` reported success).
if [[ "$TYPE_GPU" == "v100" ]]; then
    NUM_CPUS_PER_GPU=10
elif [[ "$TYPE_GPU" == "a100" ]]; then
    NUM_CPUS_PER_GPU=12
else
    echo "Unknown type of GPU. Exiting." >&2
    exit 1
fi
NUM_CPUS_PER_TASK=$((NUM_CPUS_PER_GPU*NUM_GPUS_PER_NODE))

# Pick the accelerate distributed backend for the chosen sharding strategy.
# Fixes: quoted test variable and a non-zero exit status on the error path.
if [[ "$TYPE_SHARDING" == "ZeRO-3" ]]; then
    # YAML fragment spliced under the `deepspeed_config:` key (leading
    # newline + two-space indent are significant).
    DEEPSPEED_CONFIG="
  deepspeed_multinode_launcher: standard
  deepspeed_config_file: $DEEPSPEED_CONFIG_FILE
  zero3_init_flag: true"
    DISTRIBUTED_TYPE="DEEPSPEED"
elif [[ "$TYPE_SHARDING" == "DDP" ]]; then
    DEEPSPEED_CONFIG="{}"
    DISTRIBUTED_TYPE="MULTI_GPU"
else
    echo "Unknown sharding type. Exiting." >&2
    exit 1
fi

# Translate MODEL_PRECISION into the fp16/bf16 booleans of the DS json.
# Fix: the chain had no else branch, so a typo in MODEL_PRECISION left
# ENABLE_FP16/ENABLE_BF16 unset and silently produced invalid JSON in the
# generated DeepSpeed config; fail loudly instead.
if [[ "$MODEL_PRECISION" == "fp32" ]]; then
    ENABLE_FP16="false"
    ENABLE_BF16="false"
elif [[ "$MODEL_PRECISION" == "fp16" ]]; then
    ENABLE_FP16="true"
    ENABLE_BF16="false"
elif [[ "$MODEL_PRECISION" == "bf16" ]]; then
    ENABLE_FP16="false"
    ENABLE_BF16="true"
else
    echo "Unknown model precision: $MODEL_PRECISION. Exiting." >&2
    exit 1
fi

# Auto-generate the DS config consumed via accelerate's deepspeed_config_file.
# "auto" batch sizes let DeepSpeed inherit them from the launcher.
# Fix: quote the redirect target so a path with spaces cannot break it.
cat << EOT > "$DEEPSPEED_CONFIG_FILE"
{
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "none"
        }
    },
    "fp16": {
        "enabled": $ENABLE_FP16
    },
    "bf16": {
        "enabled": $ENABLE_BF16
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
EOT

# Auto-generate the accelerate config.
# Fix: DEEPSPEED_CONFIG and DISTRIBUTED_TYPE are computed above from
# TYPE_SHARDING but were dead variables — the template hardcoded the
# deepspeed section and `distributed_type: DEEPSPEED`, silently forcing
# DeepSpeed even with TYPE_SHARDING="DDP". Interpolate them instead
# (for ZeRO-3 the rendered YAML is equivalent to the previous output).
cat << EOT > "$ACCELERATE_CONFIG_FILE"
# WARNING: do not edit this file as this is an slurm-auto-generated file
compute_environment: LOCAL_MACHINE
deepspeed_config: ${DEEPSPEED_CONFIG}
distributed_type: ${DISTRIBUTED_TYPE}
fsdp_config: {}
machine_rank: 0
main_process_ip: $MASTER_ADDR
main_process_port: $MASTER_PORT
main_training_function: main
num_machines: $NUM_NODES
num_processes: $NUM_PROCESSES
use_cpu: false
EOT

# Sanity check: abort (non-zero) if the config failed to materialize.
if [[ ! -f "$ACCELERATE_CONFIG_FILE" ]] ; then
    echo "File $ACCELERATE_CONFIG_FILE is not there, aborting." >&2
    exit 1
fi

# Slurm constraint/account matching the GPU type.
# NOTE(review): CONSTRAINT and ACCOUNT are not referenced by the sbatch
# command at the bottom of this script — confirm whether they are still used.
# Fixes: quoted test variable, non-zero exit on the error path.
if [[ "$TYPE_GPU" == "v100" ]]; then
    CONSTRAINT="v100-32g"
    ACCOUNT="cnw@v100"
elif [[ "$TYPE_GPU" == "a100" ]]; then
    CONSTRAINT="a100"
    ACCOUNT="cnw@a100"
else
    echo "Unknown type of GPU. Exiting." >&2
    exit 1
fi

# if (("$NUM_NODES" > 1)); then
#     echo "NUM_NODES > 1 is not yet supported in the evaluation pipeline. Exiting."
#     exit
# fi
# Print all arguments after the first, joined by the first argument
# (a single separator character), e.g. `joinByChar , a b` -> "a,b".
joinByChar() {
  local separator="$1"
  shift
  local IFS="$separator"
  echo "$*"
}


TASKS_TO_DO=()        # one '/'-joined task string per checkpoint to evaluate
DIR_CHECKPOINTS=()    # matching checkpoint directories (without /unwrapped_model)

# Echo 1 if any single line of the document (last argument) contains every
# preceding argument as a whole word, else echo 0.
# NOTE(review): this helper appears unused in this file — confirm before removing.
# Fixes: `IFS= read -r` so backslashes/whitespace are preserved, `|| [[ -n
# $line ]]` so a final line without a trailing newline is still inspected,
# quoted `unset` subscript, and `local` variables to stop leaking globals.
function elements_found_on_same_line {
    local elements=("$@")
    local document="${elements[-1]}"
    unset 'elements[-1]'

    local line element all_found
    while IFS= read -r line || [[ -n "$line" ]]; do
        all_found=true
        for element in "${elements[@]}"; do
            if ! echo "$line" | grep -qw -- "$element"; then
                all_found=false
                break
            fi
        done

        if $all_found; then
            echo 1
            return
        fi
    done < "$document"
    echo 0
}

# Here we will retrieve all the evaluations that have not been done
# For every candidate checkpoint, collect the tasks that have no matching
# result line (same task, checkpoint and hyper-parameters) in the results
# jsonl, then record the checkpoint + its pending tasks.
for step in ${DIR_MODEL_CHECKPOINTS[@]}
do
    CHECKPOINT_TASKS_TO_DO=()
    for task in ${TASKS_TO_CHECK[@]}
    do
        found_eval="0"
        if test -f "$EVALUATION_JSONL_FILE"; then
            # Successive greps AND together all current settings; the final
            # `|| echo ""` keeps the pipeline from tripping `set -e` when no
            # line matches.
            eval_check=$(grep "\"task\": \"$task\"" $EVALUATION_JSONL_FILE \
                | grep $step \
                | grep "\"num_shots\": ${NUM_SHOTS}" \
                | grep "\"shot_selection_mode\": \"${SHOT_SELECTION_MODE}\"" \
                | grep "\"num_beams\": ${NUM_BEAMS}" \
                | grep "\"scale_up_images\": ${SCALE_UP_IMAGES}" \
                | grep "\"image_size_after_scaling\": ${IMAGE_SIZE_AFTER_SCALING}" \
                | grep "\"max_new_tokens\": ${MAX_NEW_TOKENS}" \
                | grep "\"dataset_split\": \"${DATASET_SPLIT}\"" \
                || echo "")
            if [[ ! -z "$eval_check" ]]; then
                found_eval="1"
            fi
        fi

        # NOTE(review): $PROMPT_TEMPLATE_ID is never assigned in this script,
        # so it prints empty in these log lines — confirm it comes from the
        # environment.
        if [ "$found_eval" -eq "0" ]
        then
            echo "Eval not done before: $task with $step, $NUM_SHOTS shots and $SHOT_SELECTION_MODE shot selection and $DATASET_SPLIT split and templateid $PROMPT_TEMPLATE_ID"
            CHECKPOINT_TASKS_TO_DO+=($task)
        else
            echo "Eval done before: $task with $step, $NUM_SHOTS shots and $SHOT_SELECTION_MODE shot selection and $DATASET_SPLIT split and templateid $PROMPT_TEMPLATE_ID"
        fi
    done
    if [ ${#CHECKPOINT_TASKS_TO_DO[@]} -gt 0 ]; then
        # Pending tasks for this checkpoint are '/'-joined into one string.
        STRING_CHECKPOINT_TASKS=$(joinByChar '/' ${CHECKPOINT_TASKS_TO_DO[@]})
        TASKS_TO_DO+=(${STRING_CHECKPOINT_TASKS})
        NUM_TASKS=$((NUM_TASKS + ${#CHECKPOINT_TASKS_TO_DO[@]}))
        # Strip the trailing "/unwrapped_model" to keep the checkpoint root.
        DIR_CHECKPOINTS+=(${step::-16}) # len("/unwrapped_model") = 16
    fi
done

LEN=${#DIR_CHECKPOINTS[@]}
LEN=$((LEN-1))  # last index of the 0-based Slurm job array (--array=0-$LEN)

# Fix: the old message ("Num checkpointsto evaluate: $LEN") had a typo and
# printed the count off by one (LEN is count - 1); log the real count.
echo "Num checkpoints to evaluate: ${#DIR_CHECKPOINTS[@]}"
echo "NUM_TASKS:$NUM_TASKS"
echo "${TASKS_TO_DO[@]}"
echo "${DIR_CHECKPOINTS[@]}"

# Nothing left to do is a success, exit 0 explicitly.
if (( LEN < 0 )); then
    echo "All checkpoints and tasks have been evaluated. Exiting."
    exit 0
fi

# Here we transform the array into a string where each element of the list is separated from the next and previous ones
# by |
# This change is necessary because it seems that it is not possible to pass an array variable to a slurm job with the
# --export command, but it is possible to pass a string
DIR_CHECKPOINTS_STRING=$(joinByChar '|' "${DIR_CHECKPOINTS[@]}")
TASKS_TO_DO_STRING=$(joinByChar '|' "${TASKS_TO_DO[@]}")
echo "TASKS_TO_DO_STRING: ${TASKS_TO_DO_STRING}"
# ---------------------------------------------------------------------------------------------------------

# LAUNCHING THE EVALUATIONS
# Variables forwarded to every evaluation job through sbatch --export.
# "ALL" additionally propagates the caller's whole environment.
# Fix: several entries below closed the quote before the '=' (e.g.
# "TOKENIZER_USE_FAST"=$TOKENIZER_USE_FAST), leaving the value unquoted so a
# value containing whitespace would split into several array elements; the
# whole NAME=value pair is now quoted, consistent with the first entries.
ENV_VARIABLES=(
"ALL"
"CONDA_ENV_NAME=$CONDA_ENV_NAME"
"WORKING_DIR=$WORKING_DIR"
"DIR_CHECKPOINTS_STRING=$DIR_CHECKPOINTS_STRING"
"TASKS_TO_DO_STRING=$TASKS_TO_DO_STRING"
"EVALUATION_FILE=$EVALUATION_JSONL_FILE"
"NUM_SHOTS=$NUM_SHOTS"
"SHOT_SELECTION_MODE=$SHOT_SELECTION_MODE"
"NUM_BEAMS=$NUM_BEAMS"
"MAX_NEW_TOKENS=$MAX_NEW_TOKENS"
"BATCH_SIZE_PER_GPU=$BATCH_SIZE_PER_GPU"
"ACCELERATE_CONFIG_FILE=$ACCELERATE_CONFIG_FILE"
"NUM_PROCESSES=$NUM_PROCESSES"
"TOKENIZER_USE_FAST=$TOKENIZER_USE_FAST"
"EVALUATION_VERSION=$EVALUATION_VERSION"
"MODEL_PRECISION=$MODEL_PRECISION"
"SHOW_GPU_MEM_UTIL=$SHOW_GPU_MEM_UTIL"
"DATASET_SPLIT=$DATASET_SPLIT"
"PROMPT_TEMPLATE_ID=$PROMPT_TEMPLATE_ID"
"EVAL_RUNNAME=$EVAL_RUNNAME"
"USE_SELECTED_PROMPT_TEMPLATE_IDS=$USE_SELECTED_PROMPT_TEMPLATE_IDS"
"SCALE_UP_IMAGES=$SCALE_UP_IMAGES"
"IMAGE_SIZE_AFTER_SCALING=$IMAGE_SIZE_AFTER_SCALING"
)
# Quoted expansion so empty values cannot be dropped by word splitting.
EXPORTED_ENV_VARIABLES=$(joinByChar ',' "${ENV_VARIABLES[@]}")

# We launch a job array in which each job will launch an evaluation.
# Fix: the command was built as one flat string and re-word-split with
# `$($SBATCH_CMD)`, which breaks as soon as any interpolated value contains
# whitespace; build an argument array instead and quote every expansion.
SBATCH_ARGS=(
    --nodes="${NUM_NODES}"
    --gres="gpu:${NUM_GPUS_PER_NODE}"
    --cpus-per-task="${NUM_CPUS_PER_TASK}"
    --time "${NUM_HOURS}:00:00"
    --array="0-${LEN}%4"
    --job-name="auto_eval_captioning_1024_${RUN_NAME}_${NUM_SHOTS}_shot_${SHOT_SELECTION_MODE}_split_${DATASET_SPLIT}_templateid_${PROMPT_TEMPLATE_ID}"
    --export="${EXPORTED_ENV_VARIABLES}"
    --qos normal
)
ID_JOB=$(sbatch "${SBATCH_ARGS[@]}" experiments/evaluation/vloom/common/run_evals_multi_task_cluster_s3.slurm)
